---
title: "TrueLaughAnalysis"
output: html_document
---

## Load data and libraries

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
pacman ::p_load(readr,lme4,ggplot2,glmnet,tidyverse,groupdata2,cvms,caret,plyr,Metrics,boot,pROC)
setwd('~/Dropbox (Personal)/LaughterProject/TrueLaughs')
Data=read_delim('CleanData.csv',delim=";")
Predictors=Data[,c(3:41)]
Truth=Data$Truth
MeanJudgment=Data$TruthRatio
Samoa=Data$Samoa
```

## Modeling truth judgment as a function of acoustic features of laughters

- Cross-validated Lasso analysis (0.5 for elastic net)
- Only take selected features
- Stepwise cross-validated model selection repeated 100 times

```{r cars}
x   <- model.matrix(~.-1, data= Predictors)
y   <- MeanJudgment
y=as.numeric(unlist(y))
x=scale(x)
y=scale(y)

# Define cross-validated alpha selection
alphaslist<-seq(0,1,by=0.1)
foldslist<-seq(4,nrow(x))
pars=expand.grid(alphaslist,foldslist)

cvm=matrix(rep(0,nrow(pars)))

elasticnet<-lapply(1:nrow(pars), function(a){cv.glmnet(x, y, alpha=pars[a,1], family="gaussian", lambda.min.ratio=.001,nfolds = pars[a,2])})
for (i in 1:nrow(pars)) {cvm[i]=min(elasticnet[[i]]$cvm)}
n=which(cvm==min(cvm))

alpha=pars[n,1]
nfolds=pars[n,2]

# Run cross-validated elastic net with the chosen alpha (to choose lambda)
mod_cv <- cv.glmnet(x=x, y=y, family='gaussian', alpha=alpha, nfolds=nfolds)
coefs=as.data.frame(as.matrix(coef(mod_cv, mod_cv$lambda.1se)))
coefs$predictors<-rownames(coefs)
rownames(coefs) <- NULL
names(coefs)[1] <- "betas"
coefs=subset(coefs,betas!=0)
coefs1=coefs[order(abs(coefs$betas)),]
coefs1

## Run a proper model with the chosen predictors
## N.B. the chosen predictors are then inputed manually
## N.B. Exact list of predictors changes at each run (due to randomness of cross-validation process). 
## However, the final list selected in the process below is generally the same

#(Intercept)
#IntensityMean
#IntensityCV
#HNRMean
#HNRSD
#HNRMeanAbsoluteDeviation
#PitchLogMean

Data <- fold(Data, k = 5) %>% 
  arrange(.folds)

models <- c("TruthRatio~1")

CV1 <- cross_validate(Data, models, 
                     folds_col = '.folds', 
                     family='gaussian', 
                     REML = FALSE)

folds <- createFolds(1:nrow(Data), k=nfolds)
Data1=as.data.frame(scale(Data[,c(62,3:41)]))

(cv <- ldply(list("TruthRatio ~ 1 + IntensityCV",
          "TruthRatio ~ 1 + IntensityCV+PitchLogMean",
          "TruthRatio ~ 1 + IntensityCV+PitchLogMean+HNRMeanAbsoluteDeviation",
          "TruthRatio ~ 1 + IntensityCV+PitchLogMean+HNRMeanAbsoluteDeviation+HNRSD",
          "TruthRatio ~ 1 + IntensityCV+PitchLogMean+HNRMeanAbsoluteDeviation+HNRSD+HNRMean",
          "TruthRatio ~ 1 + IntensityCV+PitchLogMean+HNRMeanAbsoluteDeviation+HNRSD+HNRMean+IntensityMean"),
     function (f) {
       # `f` is the formula for each model to compare
       for (fold in folds) {
          train = Data1[-fold,]
          test = Data1[fold,]
          model <- lm(formula = f, data = train)
          
         return(cbind(
           aic=AIC(model),
           bic=BIC(model),
           training_rmse = rmse(train$TruthRatio, predict(model)),
           testing_rmse = rmse(test$TruthRatio, predict(model, test)),
           training_r2 = 1 - (sum((train$TruthRatio-predict(model) )^2)/sum((train$TruthRatio-mean(train$TruthRatio))^2)),
           testing_r2 = 1 - (sum((test$TruthRatio-predict(model, test) )^2)/sum((test$TruthRatio-mean(test$TruthRatio))^2)),
           f=f))
       }
     }) %>%
    mutate(training_rmse = as.numeric(levels(training_rmse)),
           testing_rmse = as.numeric(levels(testing_rmse))))

### Save predictions for the plot
Performance=NULL
folds <- createFolds(1:nrow(Data), k=5)
for (fold in folds) {
          train = Data[-fold,]
          test = Data[fold,]
          model <- lm(TruthRatio ~ 1 + IntensityCV+PitchLogMean+HNRMeanAbsoluteDeviation, data = train)
          Preds=data.frame(actual=test$TruthRatio,preds=predict(model,test),n=fold)
          
          if (exists("Performance")){
            Performance=rbind(Performance,Preds)
          } else {
            Performance=Preds
          }
       }

Performance=Performance[order(Performance$n),]
Performance$Truth='Spontaneous Laugh'
Performance$Truth[Data$Truth==0]='Volitional Laugh'

ggplot(Performance,aes(preds,actual))+geom_point(aes(color=Truth))+geom_smooth(method=lm,color='dark grey')+
  theme_classic()+xlab('Cross-validated predictions from the model')+ylab('Percent judging laugh as real')+
  theme(legend.title=element_blank(),legend.position=c(.8,.1),text = element_text(size=14))

### Calculating CIs
r2=rep(0,100)
for (i in 1:100){
folds <- createFolds(1:nrow(Data), k=5)
Performance=NULL
for (fold in folds) {
          train = Data[-fold,]
          test = Data[fold,]
          model <- lm(TruthRatio ~ 1 + IntensityCV+PitchLogMean+HNRMeanAbsoluteDeviation, data = train)
          Preds=data.frame(actual=test$TruthRatio,preds=predict(model,test),n=fold)
          
          if (exists("Performance")){
            Performance=rbind(Performance,Preds)
          } else {
            Performance=Preds
          }
}
r2[i]=1 - (sum((Performance$actual-Performance$preds )^2)/sum((Performance$actual-mean(Performance$actual))^2))
}

mean(r2)
range(r2)

### Beta values
m3=lm(TruthRatio ~ 1 + IntensityCV+PitchLogMean+HNRMeanAbsoluteDeviation,Data1)
summary(m3)

# Against local data
m3a=lm(TruthRatio ~ 1 + IntensityCV+PitchLogMean+HNRMeanAbsoluteDeviation,Data)
Data$Preds=predict(m3a)
m3b=lm(TruthRatio ~ 1 + IntensityCV+PitchLogMean+HNRMeanAbsoluteDeviation,Data1)
Data$Preds1=predict(m3b)

for (land in 42:62){
  
  scaly=scale(Data[,land])
  print(colnames(Data[,land]))
  
  print( 1 - (sum((Data[,land]-Data$Preds)^2) / sum((Data[,land]-mean(unlist(Data[,land])))^2)))
  #print(1 - (sum((scaly-Data$Preds1)^2)/sum((scaly-mean(scaly)^2))))
  
}



```


## Reproducing ground truth
```{r}
x   <- model.matrix(~.-1, data= Predictors)
y   <- Truth
y=as.numeric(unlist(y))
x=scale(x)

# Define cross-validated alpha selection
alphaslist<-seq(0,1,by=0.1)
foldslist<-seq(4,12)
pars=expand.grid(alphaslist,foldslist)

cvm=matrix(rep(0,nrow(pars)))
cvm1=matrix(rep(0,length(alphaslist)))

elasticnet1<-lapply(1:length(cvm1), function(a){cv.glmnet(x, y, alpha=alphaslist[a], family="binomial", lambda.min.ratio=.001,nfolds = 5)})

for (i in 1:nrow(pars)) {cvm[i]=min(elasticnet[[i]]$cvm)}

for (i in 1:length(alphaslist)) {cvm1[i]=min(elasticnet1[[i]]$cvm)}

n=which(cvm==min(cvm))

n1=which(cvm1==min(cvm1))

alpha=pars[n,1]
nfolds=pars[n,2]

alpha1=alphaslist[n1]

# Run cross-validated elastic net with the chosen alpha (to choose lambda)
mod_cv <- cv.glmnet(x=x, y=y, family='binomial', alpha=alpha1, nfolds=5)
coefs=as.data.frame(as.matrix(coef(mod_cv, mod_cv$lambda.1se)))
coefs$predictors<-rownames(coefs)
rownames(coefs) <- NULL
names(coefs)[1] <- "betas"
coefs=subset(coefs,betas!=0)
coefs1=coefs[order(abs(coefs$betas)),]
coefs1

Data=read_delim('CleanData.csv',delim=";")

Data1=as.data.frame(scale(Data[,3:41]))
#Data1$Truth='True Laugh'
#Data1$Truth[Data$Truth==0]='False Laugh'
Data1$Truth=Data$Truth
Data1$Truth=as.factor(Data1$Truth)


Data1 <- fold(Data1, k = 5) %>% 
  arrange(.folds)

models <- c("Truth~1",
            "Truth~1+MeanInterVoicingInterval",
            "Truth~1+MeanInterVoicingInterval+HNRIQR",
            "Truth~1+MeanInterVoicingInterval+HNRIQR+HNRMedian",
            "Truth~1+MeanInterVoicingInterval+HNRIQR+HNRMedian+PitchLogMedian",
            "Truth~1+MeanInterVoicingInterval+HNRIQR+PitchLogMedian"
            )

CV2 <- cross_validate(Data1, models, 
                     folds_col = '.folds', 
                     family='binomial')

CV2
glance_logistic <- function(m=NULL, cutoff = .5, testset=NULL, result = NULL, f = NULL) {
  
  # confusion matrix
  conf <- with(result, confusionMatrix(predicted, Truth, positive="1"))
  # AUC
  ROC <- with(result, roc(Truth, prediction))
  
  ret <- data.frame(Model = f,
                    Accuracy = conf$overall[c('Accuracy')],
                    Sensitivity = conf$byClass['Sensitivity'],
                    Specificity = conf$byClass['Specificity'],
                    NPV = conf$byClass['Neg Pred Value'],
                    PPV = conf$byClass['Pos Pred Value'],
                    Precision = conf$byClass['Precision'],
                    Recall = conf$byClass['Recall'],
                    AUC = auc(ROC),
                    n = nrow(result))
  rownames(ret) <- NULL
  return(ret)
}

#crossvalidation function

crossROC = function(formular,folds,data){
  
  results=data.frame() #create empty dataframe
  
  for (i in folds){
  
  train.subset=data[-(i),] #create the training subset
  test.subset=data[i,] #create the test subset (we actually don't use this)
  
  model = glm(formular,data=train.subset, family = binomial) #give it the model to crossvalidate
  
  Truth=test.subset$Truth #find the actual truth in the data
  prediction=predict(model,test.subset,type="response") #find the predicted value in percent
  predicted=ifelse(prediction<0.5,"0","1") #find the predicted class
  
  results= rbind(results, data.frame(Truth,prediction,predicted,n=i)) #make a new row in the results dataframe for each fold 
  
  }
  return (glance_logistic(result = results,f=formular)) #feed the cross validated model to the function that finds the ROC parameters
}

Res=NULL
for (dd in 1:100){
   folds=createFolds(1:nrow(Data1), k = 5, list = TRUE, returnTrain = FALSE) #create the folds 
   
   temp=crossROC("Truth ~ 1 + MeanInterVoicingInterval",folds,Data1)
   if (exists("Res")){Res=rbind(Res,temp)}else{Res=temp}
}

Res=NULL
for (dd in 1:100){
   folds=createFolds(1:nrow(Data1), k = 5, list = TRUE, returnTrain = FALSE) #create the folds 
   temp=crossROC("Truth~1+MeanInterVoicingInterval+HNRIQR+HNRMedian",folds,Data1)
    if (exists("Res")){Res=rbind(Res,temp)}else{Res=temp}
}
mean(Res$Accuracy)
range(Res$Accuracy)
mean(Res$Sensitivity)
range(Res$Sensitivity)
mean(Res$Specificity)
range(Res$Specificity)
mean(Res$AUC)
range(Res$AUC)

model4=glm(Truth~1+MeanInterVoicingInterval+HNRIQR+HNRMedian,Data1,family=binomial)
summary(model4)
```

## Crossing the models
```{r}
model5=glm(Truth~1 + IntensityCV+PitchLogMean+HNRMeanAbsoluteDeviation,Data1,family=binomial)
summary(model5)

Data1 <- fold(Data1, k = 5) %>% 
  arrange(.folds)

models <- c("Truth~1 + IntensityCV+PitchLogMean+HNRMeanAbsoluteDeviation")

CV3 <- cross_validate(Data1, models, 
                     folds_col = '.folds', 
                     family='binomial', 
                     REML = FALSE)
CV3

Res=NULL
for (dd in 1:100){
   folds=createFolds(1:nrow(Data1), k = 5, list = TRUE, returnTrain = FALSE) #create the folds 
   temp=crossROC("Truth~1+IntensityCV+PitchLogMean+HNRMeanAbsoluteDeviation",folds,Data1)
   if (exists("Res")){Res=rbind(Res,temp)}else{Res=temp}
}
mean(Res$Accuracy)
range(Res$Accuracy)
mean(Res$AUC)
range(Res$AUC)


model6=lm(TruthRatio~1+MeanInterVoicingInterval+HNRIQR+HNRMedian,Data1)
summary(model6)
Data <- fold(Data, k = 5) %>% 
  arrange(.folds)
models <- c("TruthRatio~1+MeanInterVoicingInterval+HNRIQR+HNRMedian")

CV4 <- cross_validate(Data, models, 
                     folds_col = '.folds', 
                     family='gaussian', 
                     REML = FALSE)
CV4
```


